knitr::opts_chunk$set(echo = TRUE)
library("infer")      #tools to easily compute confidence intervals
library("moderndive") #case study example
library("readxl")     #to load Excel spreadsheets
library("tidyverse")  #suite of useful data analysis packages
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
outlier_remover <- function(numerical_list){
  # This function will brashly remove outliers from a numerical list
  # by applying a mean plus/minus 2 standard deviations
  # range-rule-of-thumb interval
  # INPUT: a column of numerical data (missing values are okay)
  # OUTPUT: the numerical list, but with outliers replaced with "NA"
  
  # compute sample mean
  xbar <- mean(numerical_list,  na.rm = TRUE)
  
  # compute sample standard deviation
  s <- sd(numerical_list, na.rm = TRUE)
  
  # nullify outliers
  numerical_list[numerical_list < xbar - 2*s] <- NA
  numerical_list[numerical_list > xbar + 2*s] <- NA
  
  # return
  numerical_list
}

outlier_remover2 <- function(x){
  # This function will remove possible outliers in a vector
  # idea: outliers are in the extreme 10 percent of values
  # INPUT: data vector
  # OUTPUT: revised data vector without outliers
  
  # xbar <- mean(x, na.rm = TRUE)
  # s <- sd(x, na.rm = TRUE)
  p10 <- quantile(x, probs = 0.10, na.rm = TRUE)
  p90 <- quantile(x, probs = 0.90, na.rm = TRUE)
  n <- length(x)
  new_x <- x
  
  for(i in 1:n){
    if(is.na(x[i])){
      new_x[i] <- NA
    } else {
      if(x[i] >= p90){ new_x[i] <- NA }
      if(x[i] <= p10){ new_x[i] <- NA }
    }
  }
  
  #return
  new_x
}

Case Study

After each code block, to the best of your current ability, describe what the R code is doing.

This material comes from Chapter 8 of Statistical Inference via Data Science: A ModernDive into R and the Tidyverse

“Let’s apply our knowledge of confidence intervals to answer the question: “Is yawning contagious?”. If you see someone else yawn, are you more likely to yawn? In an episode of the US show Mythbusters, the hosts conducted an experiment to answer this question.”

“Fifty adult participants who thought they were being considered for an appearance on the show were interviewed by a show recruiter. In the interview, the recruiter either yawned or did not. Participants then sat by themselves in a large van and were asked to wait. While in the van, the Mythbusters team watched the participants using a hidden camera to see if they yawned. The data frame containing the results of their experiment is available in the mythbusters_yawn data frame included in the moderndive package:”

mythbusters_yawn
## # A tibble: 50 × 3
##     subj group   yawn 
##    <int> <chr>   <chr>
##  1     1 seed    yes  
##  2     2 control yes  
##  3     3 seed    no   
##  4     4 seed    yes  
##  5     5 seed    no   
##  6     6 control no   
##  7     7 seed    yes  
##  8     8 control no   
##  9     9 control no   
## 10    10 seed    no   
## # ℹ 40 more rows
mythbusters_yawn %>% 
  group_by(group, yawn) %>% 
  summarize(count = n())
## `summarise()` has grouped output by 'group'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   group [2]
##   group   yawn  count
##   <chr>   <chr> <int>
## 1 control no       12
## 2 control yes       4
## 3 seed    no       24
## 4 seed    yes      10
mythbusters_yawn %>% 
  specify(formula = yawn ~ group, success = "yes")
## Response: yawn (factor)
## Explanatory: group (factor)
## # A tibble: 50 × 2
##    yawn  group  
##    <fct> <fct>  
##  1 yes   seed   
##  2 yes   control
##  3 no    seed   
##  4 yes   seed   
##  5 no    seed   
##  6 no    control
##  7 yes   seed   
##  8 no    control
##  9 no    control
## 10 no    seed   
## # ℹ 40 more rows
first_six_rows <- head(mythbusters_yawn)
first_six_rows %>% 
  sample_n(size = 6, replace = TRUE)
## # A tibble: 6 × 3
##    subj group   yawn 
##   <int> <chr>   <chr>
## 1     6 control no   
## 2     4 seed    yes  
## 3     1 seed    yes  
## 4     6 control no   
## 5     4 seed    yes  
## 6     2 control yes
mythbusters_yawn %>% 
  specify(formula = yawn ~ group, success = "yes") %>% 
  generate(reps = 1000, type = "bootstrap")
## Response: yawn (factor)
## Explanatory: group (factor)
## # A tibble: 50,000 × 3
## # Groups:   replicate [1,000]
##    replicate yawn  group  
##        <int> <fct> <fct>  
##  1         1 no    seed   
##  2         1 no    seed   
##  3         1 yes   seed   
##  4         1 no    control
##  5         1 no    seed   
##  6         1 no    control
##  7         1 no    seed   
##  8         1 yes   seed   
##  9         1 no    seed   
## 10         1 no    control
## # ℹ 49,990 more rows
bootstrap_distribution_yawning <- mythbusters_yawn %>% 
  specify(formula = yawn ~ group, success = "yes") %>% 
  generate(reps = 1000, type = "bootstrap") %>% 
  calculate(stat = "diff in props", order = c("seed", "control"))
bootstrap_distribution_yawning
## Response: yawn (factor)
## Explanatory: group (factor)
## # A tibble: 1,000 × 2
##    replicate    stat
##        <int>   <dbl>
##  1         1  0.246 
##  2         2 -0.0175
##  3         3  0.117 
##  4         4 -0.314 
##  5         5 -0.0696
##  6         6  0.133 
##  7         7  0.1   
##  8         8  0.05  
##  9         9  0.0476
## 10        10 -0.146 
## # ℹ 990 more rows
visualize(bootstrap_distribution_yawning) +
  geom_vline(xintercept = 0)

bootstrap_distribution_yawning %>% 
  get_confidence_interval(type = "percentile", level = 0.95)
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   -0.238    0.305
obs_diff_in_props <- mythbusters_yawn %>% 
  specify(formula = yawn ~ group, success = "yes") %>% 
  # generate(reps = 1000, type = "bootstrap") %>% 
  calculate(stat = "diff in props", order = c("seed", "control"))
obs_diff_in_props
## Response: yawn (factor)
## Explanatory: group (factor)
## # A tibble: 1 × 1
##     stat
##    <dbl>
## 1 0.0441
myth_ci_se <- bootstrap_distribution_yawning %>% 
  get_confidence_interval(type = "percentile", level = 0.95)
myth_ci_se
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1   -0.238    0.305

Demographics Data

# since the data is in a CSV file, we will use the read_csv() function to load the data
demo_df <- read_csv("demographics_survey_data.csv") 
# as usual, it is good to take a look at your data
# for instance, looking for numeric or categorical data
str(demo_df, give.attr = FALSE)
## spc_tbl_ [281 × 131] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ enrolled            : chr [1:281] "Bio 18" "Bio 18" "Bio 18" "Bio 18" ...
##  $ home                : chr [1:281] "Central California" "Central California" "Southern California" "Central California" ...
##  $ ethnicity           : chr [1:281] "Hispanic or Latino,White or Caucausian" "Hispanic or Latino" "Hispanic or Latino" "Hispanic or Latino,White or Caucausian" ...
##  $ classStanding       : chr [1:281] "Senior" "Junior" "Sophomore" "Junior" ...
##  $ unitsCompleted      : num [1:281] 80 NA NA 104 70 NA 0 0 0 0 ...
##  $ unitsThisSemester   : num [1:281] NA 16 15 16 13 13 16 12 12 12 ...
##  $ GPA                 : num [1:281] NA NA NA NA NA NA NA NA NA NA ...
##  $ statsBefore         : chr [1:281] NA "Yes: Math 15 at UC Merced" "No" "Yes: Math 15 at UC Merced" ...
##  $ retakingWithDerek   : chr [1:281] NA "No" "No" "No" ...
##  $ learningStyle       : chr [1:281] NA "Visual" "Visual" "Visual" ...
##  $ hoursStudying       : num [1:281] NA 4 NA 2 8 50 30 5 24 NA ...
##  $ birthdayMonth       : chr [1:281] NA "December" "November" "May" ...
##  $ height              : num [1:281] NA 67 63 61 70 ...
##  $ shoeSize            : num [1:281] NA 7.5 7.5 5 11 8 9 12 10.5 9 ...
##  $ weight              : num [1:281] NA 170 NA 115 160 120 167 204 170 130 ...
##  $ calories            : num [1:281] NA 1800 NA NA 2500 600 2000 1780 2000 1500 ...
##  $ exercise            : num [1:281] NA 1 NA 8 10 NA 6 21 5 3 ...
##  $ sodas               : num [1:281] NA 2 1 0 0 0 2 0 0 1 ...
##  $ alcohol             : num [1:281] NA 0 NA NA 0 0 0 14 NA 0 ...
##  $ sleep               : num [1:281] NA 7 NA 6 5 6 5 8 7.5 7 ...
##  $ socialMedia         : chr [1:281] NA "Instagram" "Instagram,LinkedIn" "Instagram" ...
##  $ smartPhones         : chr [1:281] NA "iPhone" "iPhone" "iPhone" ...
##  $ baseball            : chr [1:281] NA "New York Mets,New York Yankees,San Franciso Giants" "Los Angeles Dodgers" NA ...
##  $ football            : chr [1:281] NA "San Francisco 49ers" NA NA ...
##  $ basketball          : chr [1:281] NA "Los Angeles Lakers" NA NA ...
##  $ hockey              : chr [1:281] NA "Los Angeles Kings" NA NA ...
##  $ politics            : num [1:281] NA 30 5 35 NA 50 50 20 50 25 ...
##  $ religious           : num [1:281] NA 50 50 50 90 0 30 60 0 30 ...
##  $ Kinsey              : chr [1:281] NA "3" "0" "0" ...
##  $ livingSituation     : chr [1:281] NA "off-campus apartment in Merced" "dormitory" "dormitory" ...
##  $ roommates           : num [1:281] NA 3 2 1 4 4 6 3 0 1 ...
##  $ roommatesLiked      : num [1:281] NA 3 2 1 4 NA 1 3 NA 1 ...
##  $ hoursWorking        : num [1:281] NA 0 NA NA 0 0 15 0 0 10 ...
##  $ happinessUCM        : num [1:281] NA NA 80 100 80 90 30 40 50 50 ...
##  $ happinessMercedCity : num [1:281] NA 50 80 30 80 60 30 10 40 35 ...
##  $ transportation      : chr [1:281] NA "bus" "bus" "car" ...
##  $ counselors          : num [1:281] NA 80 95 99 100 NA 60 80 30 NA ...
##  $ anxiousThisClass    : num [1:281] NA 60 60 2 65 50 70 0 0 NA ...
##  $ highSchoolType      : chr [1:281] NA "public school" "charter school" "public school" ...
##  $ anxiousTransitition : num [1:281] NA 80 0 10 70 50 50 80 50 60 ...
##  $ drugs               : chr [1:281] NA "Yes" NA "No" ...
##  $ diningCommons       : num [1:281] NA 85 20 35 0 NA 0 5 60 10 ...
##  $ diningOptions       : chr [1:281] NA "Bookstore\\, Marketplace\\, and/or the Lantern" "dining commons,Bookstore\\, Marketplace\\, and/or the Lantern,restaurant vendors" "dining commons,restaurant vendors" ...
##  $ enrolled2           : chr [1:281] "Bio 18" "Bio 18" NA "Bio 18" ...
##  $ UCMtopChoice        : chr [1:281] "No" "No" "No" "Yes" ...
##  $ parties             : chr [1:281] "Yes" "No" "Yes" "No" ...
##  $ firstGeneration     : chr [1:281] "Yes" "Yes" "Yes" "Yes" ...
##  $ languages           : num [1:281] 3 2 2.2 1.5 1.3 2 2.2 2 2 3.5 ...
##  $ parentsMarried      : chr [1:281] "Yes" "No" "No" "Yes" ...
##  $ homesickness        : num [1:281] 0 80 50 60 9 60 0 0 10 100 ...
##  $ safteyCampus        : num [1:281] 50 80 80 80 85 70 100 0 40 90 ...
##  $ makingFriends       : num [1:281] 100 50 50 75 70 90 50 0 65 70 ...
##  $ GreekLife           : chr [1:281] "No" "No" "No" "No" ...
##  $ campusSports        : chr [1:281] "No" "No" "No" "No" ...
##  $ anxiousOfficeHours  : num [1:281] 0 80 40 50 40 40 50 50 20 90 ...
##  $ studyGroups         : num [1:281] 100 60 80 100 65 60 50 100 70 70 ...
##  $ allowance           : num [1:281] 0 0 0 100 200 300 0 0 0 500 ...
##  $ familyIncome        : num [1:281] 100000 20000 32000 80000 65000 250000 13000 35000 NA 40000 ...
##  $ budgetHousing       : num [1:281] 40 70 20 40 40 50 25 100 100 20 ...
##  $ budgetFood          : num [1:281] 30 15 20 20 10 30 25 30 30 7 ...
##  $ budgetTransportation: num [1:281] 50 10 0 20 20 20 10 40 10 3 ...
##  $ budgetEducation     : num [1:281] 10 5 60 20 5 30 10 30 30 70 ...
##  $ retirementSavings   : chr [1:281] "Yes" "Yes" "No" "Yes" ...
##  $ happiness           : num [1:281] 80 50 50 70 90 85 80 100 70 70 ...
##  $ intelligence        : num [1:281] 80 60 60 89 75 80 80 69 0 75 ...
##  $ attractiveness      : num [1:281] 30 50 50 79 65 60 70 20 0 70 ...
##  $ transferring        : chr [1:281] "Yes" "No" "No" "No" ...
##  $ relationshipStatus  : chr [1:281] "Single" "Single" "Single" "Single" ...
##  $ favoriteColor       : chr [1:281] "White and red" "orange" "blue" "blue" ...
##  $ favoriteNumber      : num [1:281] 12 2 8 3 23 8 7 7 12 9 ...
##  $ happinessUCMcourses : num [1:281] 100 90 85 55 65 80 85 100 50 70 ...
##  $ showering           : num [1:281] 5 5 7 7 14 4 9 8 6 7 ...
##  $ brushingTeeth       : num [1:281] 20 7 12 14 14 14 14 20 7 14 ...
##  $ flossing            : num [1:281] 10 2 4 14 7 14 14 0 0 0 ...
##  $ washingHair         : num [1:281] 5 2 7 7 4 4 9 30 6 6 ...
##  $ pets                : num [1:281] 0 0 0 0 0 0 1 0 0 0 ...
##  $ homeVisits          : chr [1:281] "holidays" "monthly" "holidays" "weekly" ...
##  $ UCMfinanialAid      : num [1:281] 60 90 100 85 99 100 100 0 90 90 ...
##  $ careerConfidence    : num [1:281] 90 70 50 78 85 75 100 100 70 50 ...
##  $ siblings            : num [1:281] 1 1 3 1 1 1 6 1 5 3 ...
##  $ favoriteTV          : chr [1:281] "Greys anatomy" "Rick and Morty" "The Walking Dead" "Greys Anatomy" ...
##  $ favoriteMovie       : chr [1:281] "The lion king" "Lovely Bones" NA "Crazy Rich Asians" ...
##  $ loneliness          : num [1:281] 80 90 50 25 0 75 50 0 69 90 ...
##  $ organized           : num [1:281] 70 100 80 49 95 100 70 50 50 80 ...
##  $ breakfast           : chr [1:281] "No" "No" "No" "Yes" ...
##  $ snacks              : num [1:281] 3 2 3 3 5 3 5 3 5 4 ...
##  $ mandms              : num [1:281] 50 50 NA 100 23 150 0 300 370 52 ...
##  $ GPAhighSchool       : num [1:281] 3.9 3.5 NA 3.9 3.78 4 2.5 2 NA 3.7 ...
##  $ SATscore            : num [1:281] 600 1000 NA 1080 1430 1380 0 300 NA 0 ...
##  $ GreekLifeFuture     : chr [1:281] "I do not plan to join a fraternity nor sorority" "I do not plan to join a fraternity nor sorority" NA "I do not plan to join a fraternity nor sorority" ...
##  $ campusGroups        : num [1:281] 50 0 30 57 0 40 50 50 0 50 ...
##  $ procrastinator      : chr [1:281] "No" "Yes" "Yes" "Yes" ...
##  $ enrolled3           : chr [1:281] "Bio 18" "Bio 18" "Bio 18" "Bio 18" ...
##  $ computerAccess      : chr [1:281] "Laptop computer" "Laptop computer,Tablet" "Laptop computer" "Laptop computer" ...
##  $ reddit              : chr [1:281] "No" "No" "No" "No" ...
##  $ friends             : num [1:281] 6 5 8 3 5 10 10 22 1 10 ...
##  $ musicStudying       : chr [1:281] "Yes" "No" "No" "Yes" ...
##  $ thisClassCost       : num [1:281] 2785 0 37.2 500 70 ...
##  $ BachelorsUCM        : chr [1:281] "I plan on earning a bachelor's degree at UC Merced" "I plan on earning a bachelor's degree at UC Merced" "I plan on earning a bachelor's degree at UC Merced" "I plan on earning a bachelor's degree at UC Merced" ...
##   [list output truncated]

Example: How often do you brush your teeth per week?

# sample statistics
demo_df %>%
  summarize(count = sum(!is.na(brushingTeeth)),
            xbar = mean(brushingTeeth, na.rm = TRUE),
            med  = median(brushingTeeth, na.rm = TRUE),
            s    = sd(brushingTeeth, na.rm = TRUE),
            missing = sum(is.na(brushingTeeth)))
## # A tibble: 1 × 5
##   count  xbar   med     s missing
##   <int> <dbl> <dbl> <dbl>   <int>
## 1   275  12.0    14  3.54       6
# bootstrap distribution
bootstrap_distribution <- demo_df %>%
  specify(response = brushingTeeth) %>%
  generate(reps = 1337, type = "bootstrap") %>%
  calculate(stat = "mean")
## Warning: Removed 6 rows containing missing values.
# form a 95% confidence interval
bootstrap_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1     11.5     12.4
# visualize the confidence interval and sampling distribution
SE_CI <- bootstrap_distribution %>%
  get_ci(level = 0.95, type = "percentile")

visualize(bootstrap_distribution) +
  shade_ci(endpoints = SE_CI, color = "#DAA900", fill = "#002856")

Exercises

  1. Using the GPA numerical data,
  1. compute sample statistics

  2. compute a 95% confidence interval, and describe the result in a complete sentence.

  3. produce a visual of the sampling distribution and the confidence interval.

  1. Using the hoursStudying numerical data,
  1. compute sample statistics

  2. compute a 95% confidence interval, and describe the result in a complete sentence.

  3. produce a visual of the sampling distribution and the confidence interval.

  1. Using one more column of numerical data of your choosing,
  1. compute sample statistics

  2. compute a 95% confidence interval, and describe the result in a complete sentence.

  3. produce a visual of the sampling distribution and the confidence interval.